{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Multi-Turn Chat Benchmark\n",
    "\n",
    "```bash\n",
    "gcloud container clusters create-auto cluster-1 \\\n",
    "    --location=us-central1\n",
    "\n",
    "helm repo add kubeai https://www.kubeai.org\n",
    "helm repo update\n",
    "curl -L -O https://raw.githubusercontent.com/substratusai/kubeai/refs/heads/main/charts/kubeai/values-gke.yaml\n",
    "helm upgrade --install kubeai kubeai/kubeai \\\n",
    "    -f values-gke.yaml \\\n",
    "    --set secrets.huggingface.token=$HUGGING_FACE_HUB_TOKEN \\\n",
    "    --set metrics.prometheusOperator.vLLMPodMonitor.enabled=true \\\n",
    "    --set open-webui.enabled=false \\\n",
    "    --wait\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from subprocess import run, PIPE\n",
    "import json\n",
    "from kubernetes import config, dynamic\n",
    "from kubernetes.client import api_client\n",
    "from copy import deepcopy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k8s_client = dynamic.DynamicClient(\n",
    "    api_client.ApiClient(configuration=config.load_kube_config())\n",
    ")\n",
    "models_client = k8s_client.resources.get(api_version=\"kubeai.org/v1\", kind=\"Model\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "#base_model = {\n",
    "#    \"apiVersion\": \"kubeai.org/v1\",\n",
    "#    \"kind\": \"Model\",\n",
    "#    \"metadata\": {\n",
    "#        \"name\": \"bench\",\n",
    "#        \"namespace\": \"default\",\n",
    "#    },\n",
    "#    \"spec\": {\n",
    "#        \"features\": [\"TextGeneration\"],\n",
    "#        \"url\": \"ollama://qwen2:0.5b\",\n",
    "#        \"engine\": \"OLlama\",\n",
    "#        \"resourceProfile\": \"cpu:2\"\n",
    "#    },\n",
    "#}\n",
    "base_model = {\n",
    "    \"apiVersion\": \"kubeai.org/v1\",\n",
    "    \"kind\": \"Model\",\n",
    "    \"metadata\": {\n",
    "        \"name\": \"bench\",\n",
    "        \"namespace\": \"default\",\n",
    "    },\n",
    "    \"spec\": {\n",
    "        \"features\": [\"TextGeneration\"],\n",
    "        \"url\": \"hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8\",\n",
    "        \"engine\": \"VLLM\",\n",
    "        \"args\": [\n",
    "            \"--enable-prefix-caching\",\n",
    "            \"--max-model-len=16384\",\n",
    "            \"--max-num-batched-token=16384\",\n",
    "            \"--gpu-memory-utilization=0.8\",\n",
    "            \"--disable-log-requests\"\n",
    "        ],\n",
    "        \"resourceProfile\": \"nvidia-gpu-l4:1\"\n",
    "    },\n",
    "\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [],
   "source": [
    "benches = [\n",
    "    {\n",
    "        \"thread_count\": 2000,\n",
    "        \"max_concurrent_threads\": 300,\n",
    "    },\n",
    "]\n",
    "specs = [\n",
    "    {\n",
    "        \"minReplicas\": 2,\n",
    "        \"maxReplicas\": 2,\n",
    "        \"loadBalancing\": {\n",
    "            \"strategy\": \"PrefixHash\",\n",
    "            \"prefixHash\": {\n",
    "                \"meanLoadFactor\": 115,\n",
    "                \"prefixCharLength\": 100,\n",
    "                \"replication\": 1000,\n",
    "            },\n",
    "        },\n",
    "    },\n",
    "    {\n",
    "        \"minReplicas\": 2,\n",
    "        \"maxReplicas\": 2,\n",
    "        \"loadBalancing\": {\n",
    "            \"strategy\": \"LeastLoad\",\n",
    "            \"prefixHash\": {\n",
    "                \"meanLoadFactor\": 115,\n",
    "                \"prefixCharLength\": 100,\n",
    "                \"replication\": 1000,\n",
    "            },\n",
    "        },\n",
    "    },\n",
    "] \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'thread_count': 2000, 'max_concurrent_threads': 300}\n",
      "pod/bench created\n",
      "pod/bench condition met\n",
      "model.kubeai.org/bench condition met\n",
      "kubectl exec bench -- bench --threads=./data/large-exact.json --thread-count=2000 --max-concurrent-threads=300 --request-model=bench --max-completion-tokens=40 --request-timeout=2m --seed=2 --format=json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/02/24 03:29:15 Shuffling dataset threads\n",
      "2025/02/24 03:29:15 Trimming dataset threads (9204) to specified thread count (2000)\n",
      "2025/02/24 03:29:15 Starting run...\n",
      "2025/02/24 03:36:18 Run completed, starting summarization...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'input_thread_count': 2000, 'input_messages_per_thread': {'mean': 7.441}, 'duration': '7m3.113770224s', 'request_count': 14882, 'request_duration': {'mean': '7.689761059s'}, 'chunks_per_request': {'mean': 38.956255879586074}, 'failed_threads': 0, 'run_output_throughput': 1370.4540972349312, 'run_total_throughput': 15091.186932109475, 'ttft': {'mean': '2.133812223s'}, 'itl': {'mean': '141.652458ms'}, 'prompt_tokens': 5805431, 'cached_prompt_tokens': 0, 'completion_tokens': 579858, 'total_tokens': 6385289}\n",
      "pod \"bench\" deleted\n",
      "{'thread_count': 2000, 'max_concurrent_threads': 300}\n",
      "pod/bench created\n",
      "pod/bench condition met\n",
      "model.kubeai.org/bench condition met\n",
      "kubectl exec bench -- bench --threads=./data/large-exact.json --thread-count=2000 --max-concurrent-threads=300 --request-model=bench --max-completion-tokens=40 --request-timeout=2m --seed=2 --format=json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/02/24 03:38:05 Shuffling dataset threads\n",
      "2025/02/24 03:38:05 Trimming dataset threads (9204) to specified thread count (2000)\n",
      "2025/02/24 03:38:05 Starting run...\n",
      "2025/02/24 03:44:45 Run completed, starting summarization...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'input_thread_count': 2000, 'input_messages_per_thread': {'mean': 7.441}, 'duration': '6m39.71940859s', 'request_count': 14882, 'request_duration': {'mean': '7.317937037s'}, 'chunks_per_request': {'mean': 38.85230479774224}, 'failed_threads': 0, 'run_output_throughput': 1446.6898218423585, 'run_total_throughput': 15945.045106722522, 'ttft': {'mean': '1.790251177s'}, 'itl': {'mean': '140.861199ms'}, 'prompt_tokens': 5795274, 'cached_prompt_tokens': 0, 'completion_tokens': 578270, 'total_tokens': 6373544}\n",
      "pod \"bench\" deleted\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[{'spec': {'minReplicas': 2,\n",
       "   'maxReplicas': 2,\n",
       "   'loadBalancing': {'strategy': 'PrefixHash',\n",
       "    'prefixHash': {'meanLoadFactor': 115,\n",
       "     'prefixCharLength': 100,\n",
       "     'replication': 1000}}},\n",
       "  'bench': {'thread_count': 2000, 'max_concurrent_threads': 300},\n",
       "  'result': {'input_thread_count': 2000,\n",
       "   'input_messages_per_thread': {'mean': 7.441},\n",
       "   'duration': '7m3.113770224s',\n",
       "   'request_count': 14882,\n",
       "   'request_duration': {'mean': '7.689761059s'},\n",
       "   'chunks_per_request': {'mean': 38.956255879586074},\n",
       "   'failed_threads': 0,\n",
       "   'run_output_throughput': 1370.4540972349312,\n",
       "   'run_total_throughput': 15091.186932109475,\n",
       "   'ttft': {'mean': '2.133812223s'},\n",
       "   'itl': {'mean': '141.652458ms'},\n",
       "   'prompt_tokens': 5805431,\n",
       "   'cached_prompt_tokens': 0,\n",
       "   'completion_tokens': 579858,\n",
       "   'total_tokens': 6385289}},\n",
       " {'spec': {'minReplicas': 2,\n",
       "   'maxReplicas': 2,\n",
       "   'loadBalancing': {'strategy': 'LeastLoad',\n",
       "    'prefixHash': {'meanLoadFactor': 115,\n",
       "     'prefixCharLength': 100,\n",
       "     'replication': 1000}}},\n",
       "  'bench': {'thread_count': 2000, 'max_concurrent_threads': 300},\n",
       "  'result': {'input_thread_count': 2000,\n",
       "   'input_messages_per_thread': {'mean': 7.441},\n",
       "   'duration': '6m39.71940859s',\n",
       "   'request_count': 14882,\n",
       "   'request_duration': {'mean': '7.317937037s'},\n",
       "   'chunks_per_request': {'mean': 38.85230479774224},\n",
       "   'failed_threads': 0,\n",
       "   'run_output_throughput': 1446.6898218423585,\n",
       "   'run_total_throughput': 15945.045106722522,\n",
       "   'ttft': {'mean': '1.790251177s'},\n",
       "   'itl': {'mean': '140.861199ms'},\n",
       "   'prompt_tokens': 5795274,\n",
       "   'cached_prompt_tokens': 0,\n",
       "   'completion_tokens': 578270,\n",
       "   'total_tokens': 6373544}}]"
      ]
     },
     "execution_count": 167,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_results = []\n",
    "\n",
    "i = 0\n",
    "for spec in specs:\n",
    "    for  bench in benches:\n",
    "        print(bench)\n",
    "        try:\n",
    "            # Start a fresh instance of the benchmark Pod.\n",
    "            !kubectl apply -f ./bench-pod.yaml\n",
    "            !kubectl wait --timeout 10m --for=condition=Ready -f ./bench-pod.yaml\n",
    "\n",
    "            model = deepcopy(base_model)\n",
    "            model[\"spec\"].update(spec)\n",
    "            #model[\"metadata\"][\"name\"] = model[\"metadata\"][\"name\"] + f'-{i}'\n",
    "            #models_client.create(body=model)\n",
    "            models_client.patch(\n",
    "                body=model,\n",
    "                content_type=\"application/apply-patch+yaml\",\n",
    "                field_manager=\"benchmark\",\n",
    "            )\n",
    "\n",
    "\n",
    "            model_name = model.get(\"metadata\").get(\"name\")\n",
    "            model_replicas = model.get(\"spec\").get(\"minReplicas\")\n",
    "\n",
    "            !kubectl wait --timeout 30m --for=jsonpath='.status.replicas.ready'={model_replicas} model/{model_name}\n",
    "\n",
    "            thread_count = bench.get(\"thread_count\")\n",
    "            max_concurrent_threads = bench.get(\"max_concurrent_threads\")\n",
    "            cmd = f'kubectl exec bench -- bench --threads=./data/large-exact.json --thread-count={thread_count} --max-concurrent-threads={max_concurrent_threads} --request-model={model_name} --max-completion-tokens=40 --request-timeout=2m --seed=2 --format=json'\n",
    "            print(cmd)\n",
    "\n",
    "            output = run(cmd, shell=True, stdout=PIPE, encoding='utf8')\n",
    "            result = json.loads(output.stdout)\n",
    "            print(result)\n",
    "            all_results.append({\n",
    "                \"spec\": spec,\n",
    "                \"bench\": bench,\n",
    "                \"result\": result\n",
    "            }) \n",
    "        finally:\n",
    "            models_client.delete(name=model_name, namespace=\"default\")\n",
    "            !kubectl delete --now -f ./bench-pod.yaml\n",
    "            i+=1\n",
    "\n",
    "all_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Strategy PrefixHash: TTFT=2.133812223s ITL=141.652458ms TPS(total)=15091.186932109475\n",
      "Strategy LeastLoad: TTFT=1.790251177s ITL=140.861199ms TPS(total)=15945.045106722522\n"
     ]
    }
   ],
   "source": [
    "for r in all_results:\n",
    "    print(f'Strategy {r[\"spec\"][\"loadBalancing\"][\"strategy\"]}: TTFT={r[\"result\"][\"ttft\"][\"mean\"]} ITL={r[\"result\"][\"itl\"][\"mean\"]} TPS(total)={r[\"result\"][\"run_total_throughput\"]}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
